# Output:
# - the sequence file required by DTM (data-seq.dat)
# - the word count file required by DTM (data-mult.dat)
# - a word-ID table (word_ids.dat)
# - a tab delimited file with the meta data supplied with the raw speeches (meta_data.dat)

# Note:
# - R version 3.4.0 (2017-04-21)
# - quanteda_0.9.9-50

# Remove every object (hidden ones included) from the current workspace
# before the script defines its own.
rm(list = ls(all = TRUE))

library(quanteda)
library(tm)

### PATHS ##############################
# Input files
dataIn <- "./data/Hansard_speeches_1935-2014.tab"
stopwordsFile <- "./data/MySQL_stopword_list.txt"
# Output files
dataOutDfm <- "./data/dfm.RData"
dataOutWordIDs <- "./data/word_ids.dat"
dataOutMetaData <- "./data/meta_data.dat"
dataOutSeq <- "./data/data-seq.dat"
dataOutMult <- "./data/data-mult.dat"
########################################

### FUNCTIONS ##########################
# Collapse one row of a document-feature matrix into a line of the DTM
# word-count file, laid out as
#   'number.unique.words id:word.count id:word.count ...'
#
# Args:
#   dfm.row: a one-row matrix-like object whose column names are numeric
#            word IDs and whose cells are word counts.
# Returns:
#   A single string. Words with a zero count are left out and the remaining
#   id:count pairs are sorted by ascending word ID. A row without any
#   positive count yields "0 ".
vector2dtm <- function(dfm.row) {
    # IDs and counts are held as integers: a double such as 100000 would be
    # pasted as "1e+05", which is not a valid id:count token.
    df <- data.frame(id = as.integer(colnames(dfm.row)),
                     freq = as.integer(as.numeric(dfm.row)))
    df <- df[df$freq > 0, ]
    df <- df[order(df$id), ]
    paste(nrow(df), paste(df$id, df$freq, sep = ":", collapse = " "))
}
########################################

# Start a fresh log file (append = FALSE) and keep echoing everything to the
# terminal as well (split = TRUE).
sink("1-generate_DTM_file_structure.log", append = FALSE, split = TRUE)

# Record the session details so the run can be reproduced.
cat("* Session info\n")
print(sessionInfo())

# Record every input and output path used by this run, one per line.
cat("\n* Output folders and files:\n")
invisible(lapply(
    list(dataIn, stopwordsFile, dataOutDfm, dataOutWordIDs,
         dataOutMetaData, dataOutSeq, dataOutMult),
    print
))

# Load the raw speeches: tab separated, no header row, quote characters in
# the text are kept literally.
cat("\n* Read raw texts\n")
data <- read.delim(dataIn,
                   sep = "\t",
                   header = FALSE,
                   quote = "",
                   stringsAsFactors = FALSE)
N.docs <- nrow(data)
cat(sprintf("  ... Read %d documents\n", N.docs))

# Name the five input columns.
names(data) <- c("sessionID", "memberID", "speechID", "date", "speech")

# One identifier per (session, member) pair; speeches are later pooled by it.
data$groupID <- paste(data$sessionID, data$memberID)

# read stop words
# Only the first whitespace-separated column of the file is used.
# Quote and comment handling are switched off: with the read.table()
# defaults an apostrophe (e.g. in "aren't") opens a quoted string and a '#'
# starts a comment, either of which would corrupt the list.
stop.words <- read.table(stopwordsFile,
                         stringsAsFactors = FALSE,
                         quote = "",
                         comment.char = "")
stop.words <- stop.words$V1

# Text clean-up before tokenisation
cat("\n* Clean texts\n")

# Drop digits, turn '#' into a space and strip the accents from a handful of
# accented vowels. Substitutions are applied one after another, in order.
cat("  ... Remove numbers\n")
data$speech <- local({
    txt <- data$speech
    patterns <- c("[0-9]", "#", "á", "à", "â", "ã", "é")
    replacements <- c("", " ", "a", "a", "a", "a", "e")
    for (k in seq_along(patterns)) {
        txt <- gsub(patterns[k], replacements[k], txt)
    }
    txt
})

# Normalise en- and em-dashes to a plain hyphen.
cat("  ... Replace {m,n}-dash with regular dash\n")
data$speech <- local({
    txt <- data$speech
    for (dash in c("–", "—")) {
        txt <- gsub(dash, "-", txt)
    }
    txt
})

# Build the corpus from the speech texts and attach every meta data column
# as a document variable.
cat("\n* Generate corpus\n")
myCorpus <- corpus(data$speech)
for (field in c("sessionID", "memberID", "speechID", "date", "groupID")) {
    docvars(myCorpus, field) <- data[[field]]
}
rm(field)

# The texts now live in the corpus: drop them from the meta data and keep
# only the first row of each groupID to free up memory.
data$speech <- NULL
data <- data[match(unique(data$groupID), data$groupID), ]
invisible(gc())

# Build the document-feature matrix, pooling speeches by groupID
cat("\n* Generate DFM\n")
ptm <- proc.time()

# Terms removed in addition to the stop word file.
custom.word.list <- c("hon", "mr", "member", "members", "bill", "minister",
                      "prime", "government", "governments", "friend",
                      "year", "years", "gentleman", "gentlemen")

myDfm <- dfm(myCorpus,
             groups = "groupID",
             tolower = TRUE,
             stem = TRUE,
             remove = c(stop.words, custom.word.list),
             remove_punct = TRUE,
             remove_numbers = TRUE,
             remove_symbols = TRUE,
             remove_separators = TRUE,
             remove_hyphens = TRUE)

# Report the wall-clock time and the size of the untrimmed matrix.
elapsed <- proc.time() - ptm
cat(sprintf("  ... Time to generate dfm: %s seconds\n",
            round(elapsed[["elapsed"]], 2)))
cat(sprintf("  ... Number of documents before trimming: %d \n", nrow(myDfm)))
cat(sprintf("  ... Number of unique words before trimming: %d \n", ncol(myDfm)))

# trimming
cat("\n* Trimming\n")
ptm <- proc.time()

# Thresholds passed to dfm_trim() and echoed in the log below.
# NOTE(review): min.doc used to be declared as 10 while the dfm_trim() call
# hard-coded min_docfreq = 5, so the log misreported the threshold. The value
# actually applied (5) is kept here so the generated files do not change;
# raise it if 10 was the intended setting.
min.count <- 50
min.doc <- 5

myDfm <- dfm_trim(myDfm, min_count = min.count, min_docfreq = min.doc)

# remove features with nchar=1
# Guarded: with an empty index, myDfm[, -delete] would select *no* columns
# and silently wipe the whole vocabulary.
delete <- which(nchar(colnames(myDfm)) == 1)
if (length(delete) > 0) {
    myDfm <- myDfm[, -delete]
}

# remove documents with zero words after trimming from DFM and meta data
delete <- names(which(rowSums(myDfm) == 0))
myDfm <- myDfm[!(rownames(myDfm) %in% delete), ]
data <- data[!(data$groupID %in% delete), ]

elapsed <- proc.time() - ptm
cat(paste0("  ... Time to trim dfm with min count ", min.count, " and min docs ", min.doc,": ",round(elapsed[3],2)," seconds\n"))

cat(paste("  ... Number of documents after trimming:", dim(myDfm)[1],"\n"))
cat(paste("  ... Number of unique words after trimming:", dim(myDfm)[2],"\n"))

# Persist the trimmed DFM as an R object
save(myDfm, file = dataOutDfm)

# Write the per-group meta data as a tab delimited table with a header row.
# NOTE(review): the rows keep the order in which each groupID first appears
# in the input; nothing here shows that this matches the row order of
# myDfm - confirm before joining the two by position.
write.table(data,
            file = dataOutMetaData,
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)


# Word ID table: one row per feature with its corpus-wide count and a
# zero-based ID assigned in sorted word order
cat("\n* Generate word ID table \n")

words <- colSums(myDfm)
words.data <- data.frame(word = names(words),
                         freq = as.numeric(words),
                         stringsAsFactors = FALSE)
words.data <- words.data[order(words.data$word), ]
words.data$id <- 0:(nrow(words.data) - 1)

# Columns are written as word, freq, id without a header.
write.table(words.data,
            file = dataOutWordIDs,
            sep = "\t",
            row.names = FALSE,
            col.names = FALSE)

# The per-word totals are no longer needed.
rm(words)
invisible(gc())

# Sequence file for DTM
cat("\n* Generate sequence file for DTM \n")

# Layout: the first entry is the number of sessions, followed by one row per
# session holding the number of documents in that session (sessions in the
# order produced by table()).
docs.per.session <- as.numeric(table(data$sessionID))
seq.lines <- c(length(docs.per.session), docs.per.session)
write.table(seq.lines, file = dataOutSeq, col.names = FALSE, row.names = FALSE)

# The meta data and the temporaries are not used beyond this point.
rm(data, docs.per.session, seq.lines)
invisible(gc())

# generate word-count file for DTM
cat("\n* Generate word-count file for DTM \n")

# replace col names in DFM with word IDs
# One vectorised lookup: every column name is located in the word table and
# swapped for that word's ID in a single assignment. This avoids rescanning
# all column names once per word, and an ID that has already been written can
# never be mistaken for a later word.
colnames(myDfm) <- as.character(
    words.data$id[match(colnames(myDfm), words.data$word)]
)


# free memory
rm(words.data)
invisible(gc())


# Remove a stale output file. The single writeLines() call below would
# overwrite it anyway; the check is kept so the log still records that an
# earlier file was present.
if (file.exists(dataOutMult)) {
    file.remove(dataOutMult)
    cat("  ... Existing file found and deleted\n")
}

N <- nrow(myDfm)

# Convert every document row into its DTM line. seq_len() copes with an
# empty DFM (1:N would iterate over 1 and 0), and vapply() returns a
# character vector of known length instead of a list grown element by
# element.
out <- vapply(seq_len(N), function(i) {
    cat(paste("  ... Processed", i, "of", N, "docs\n"))
    vector2dtm(myDfm[i, ])
}, character(1))

# Write all lines in one go, one document per line. This opens the file once
# rather than once per document and leaves no visible return value to be
# auto-printed into the log.
writeLines(out, con = dataOutMult)

# close log file
sink()
